# Load required packages: xlsx for Excel I/O, forecast for the time-series
# models, ggplot2 for plotting.
library(xlsx)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggplot2)
# Read the raw electricity data (columns: timestamp, power, temperature).
# NOTE(review): hard-coded absolute path — consider a relative path or a
# parameter so the script is portable.
data <- read.xlsx("/Users/hani/Documents/Elec-forecast/Elec-train.xlsx",
                  sheetIndex = 1, header = TRUE)
colnames(data) <- c("Time", "Power", "Temp")
Modification of the type of time data from character to POSIXct:
# Parse the timestamp column (character, "m/d/Y H:M") into POSIXct in GMT.
data$Time <- as.POSIXct(data$Time, tz = "GMT", format = "%m/%d/%Y %H:%M")
Plot of the time series Power Consumption and Temperature:
# Quick visual check of both raw series against time.
plot(data$Time, data$Power, type = "l")
plot(data$Time, data$Temp, type = "l")
Creation of time series objects:
# Build time-series objects with 96 observations per day (15-minute steps).
# The sample starts at day 1, interval 6 and ends at day 47, interval 96.
# `frequency` is spelled out (the original relied on partial matching of
# `freq`).
power <- ts(data$Power, start = c(1, 6), end = c(47, 96), frequency = 96)
temperature <- ts(data$Temp, start = c(1, 6), end = c(47, 96), frequency = 96)
Creation of train and test dataset:
# Split both series into a training window (days 1-40) and a test window
# (days 41-47, i.e. 7 * 96 = 672 observations).
power_train <- window(power, start = c(1, 6), end = c(40, 96))
power_test <- window(power, start = c(41, 1), end = c(47, 96))
temp_train <- window(temperature, start = c(1, 6), end = c(40, 96))
temp_test <- window(temperature, start = c(41, 1), end = c(47, 96))
Plot of the power time series:
# Training series as a solid line, test series dashed, on a shared x-range.
plot(power_train, xlim = c(1, 47))
lines(power_test, lty = 2)
The shape of the time series suggests that we have a periodicity.
More details with the decomposition plot:
# Classical decomposition (trend / seasonal / remainder) of the full series,
# then the training series with its ACF/PACF, to inspect seasonality.
autoplot(decompose(power))
ggtsdisplay(power_train)
The seasonality is confirmed by the shape of the ACF plot and the decomposition plot. The period is 96, which corresponds to the total number of observations per day. Furthermore, the seasonality seems to be additive, as we do not see a significant evolution of the variance on the plot.
We start by considering an exponential smoothing model.
# Holt-Winters exponential smoothing with additive seasonality (the variance
# looks stable over time, so additive is preferred over multiplicative).
fit_hw <- HoltWinters(power_train, seasonal = "additive")
# Forecast the 7 test days: 7 * 96 = 672 steps ahead.
prev_hw <- forecast(fit_hw, h = 672)
plot(power_train, xlim = c(1, 47))
lines(power_test, lty = 2)
lines(prev_hw$mean, col = 2)
RMSE for Exponential smoothing:
# Out-of-sample RMSE of the Holt-Winters forecast on the test window.
rmse_hw <- sqrt(mean((prev_hw$mean - power_test)^2))
rmse_hw
## [1] 20.19866
# Automatic SARIMA selection on the training series (no covariate).
fit <- auto.arima(power_train)
prev <- forecast(fit, h = 672)
plot(power_train, xlim = c(1, 47))
lines(power_test, lty = 2)
lines(prev$mean, col = 2)
fit
## Series: power_train
## ARIMA(1,0,0)(0,1,0)[96]
##
## Coefficients:
## ar1
## 0.7764
## s.e. 0.0103
##
## sigma^2 estimated as 100.1: log likelihood=-13917.31
## AIC=27838.61 AICc=27838.62 BIC=27851.07
The model generated is a SARIMA (1,0,0)(0,1,0)[96].
RMSE SARIMA without covariate:
# Out-of-sample RMSE of the automatic SARIMA forecast.
rmse_sa <- sqrt(mean((prev$mean - power_test)^2))
rmse_sa
## [1] 15.37603
Residuals checking on the SARIMA model automatically generated:
# Ljung-Box test and residual plots for the automatic SARIMA model.
checkresiduals(fit)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(0,1,0)[96]
## Q* = 1495.9, df = 191, p-value < 2.2e-16
##
## Model df: 1. Total lags used: 192
# PACF of the residuals; the tiny Ljung-Box p-value above indicates that
# significant autocorrelation remains in the residuals.
ggPacf(fit$residuals)
There are still some autocorrelations in the residuals, which suggests that they are not fully modeled.
# Seasonal differencing at lag 96 to remove the daily pattern, then inspect
# the differenced series.
tmp <- diff(power_train, lag = 96)
ggAcf(tmp)
ggtsdisplay(tmp)
Looking at the ACF, it seems that there is still a trend.
# Apply an additional first difference to remove the remaining trend.
ggtsdisplay(diff(tmp))
We observe a very significant spike on lag 96 on acf with an exponential decay on pacf seasonal lags. This suggests a seasonal MA1. And we observe another significant spike on lag 4 on acf for which we will choose a non-seasonal MA4.
Following this, we can think of a SARIMA(0,1,4)(0,1,1)[96]
# Automatic search restricted to stationary models, for comparison.
# NOTE(review): stationary = TRUE forbids any differencing, so this search
# cannot reproduce the seasonal difference identified above — the resulting
# ARMA model is kept only as a baseline.
fitx <- auto.arima(power_train, stationary = TRUE, seasonal = TRUE)
fitx
## Series: power_train
## ARIMA(5,0,1) with non-zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ma1 mean
## 1.8800 -0.7717 -0.1742 0.0903 -0.0298 -0.9753 231.7521
## s.e. 0.0164 0.0345 0.0366 0.0346 0.0164 0.0032 1.0784
##
## sigma^2 estimated as 205.8: log likelihood=-15654.39
## AIC=31324.79 AICc=31324.83 BIC=31374.8
# Manually specified SARIMA(0,1,4)(0,1,1)[96], chosen from the ACF/PACF of
# the doubly differenced series.
fitsam <- arima(power_train, order = c(0, 1, 4),
                seasonal = list(order = c(0, 1, 1), period = 96))
fitsam
##
## Call:
## arima(x = power_train, order = c(0, 1, 4), seasonal = list(order = c(0, 1, 1),
## period = 96))
##
## Coefficients:
## ma1 ma2 ma3 ma4 sma1
## -0.2481 -0.0974 -0.1248 -0.3480 -0.8519
## s.e. 0.0161 0.0164 0.0184 0.0169 0.0094
##
## sigma^2 estimated as 54.44: log likelihood = -12837.35, aic = 25686.7
# Residual diagnostics for the manual SARIMA model.
checkresiduals(fitsam)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,4)(0,1,1)[96]
## Q* = 333.7, df = 187, p-value = 2.34e-10
##
## Model df: 5. Total lags used: 192
# Forecast the test window with the manual SARIMA and plot against it.
prev_sa_m <- forecast(fitsam, h = 672)
plot(power_train, xlim = c(1, 47))
lines(power_test, lty = 2)
lines(prev_sa_m$mean, col = 2)
RMSE Manual SARIMA without covariate:
# Out-of-sample RMSE of the manual SARIMA forecast.
rmse_sa_m <- sqrt(mean((prev_sa_m$mean - power_test)^2))
rmse_sa_m
## [1] 14.74924
# Neural-network autoregression on the training series.
# NOTE(review): nnetar() has no argument named `T`; it is swallowed by `...`
# and appears to have no effect — the seasonal period is already taken from
# the series frequency (96). Kept for reproducibility; consider removing.
fitnn <- nnetar(power_train, T = 96)
print(fitnn)
## Series: power_train
## Model: NNAR(20,1,11)[96]
## Call: nnetar(y = power_train, T = 96)
##
## Average of 20 networks, each of which is
## a 21-11-1 network with 254 weights
## options were - linear output units
##
## sigma^2 estimated as 47.82
# 7-day NNAR forecast, overlaid with the automatic SARIMA forecast.
prevNN <- forecast(fitnn, h = 672)
autoplot(power_test) +
  autolayer(prev$mean, series = "Auto SARIMA without covariate") +
  autolayer(prevNN$mean, series = "NNAR")
RMSE NNAR:
# Out-of-sample RMSE of the NNAR forecast.
rmse_NN <- sqrt(mean((prevNN$mean - power_test)^2))
rmse_NN
## [1] 63.31427
We introduce the temperature as a covariate for the definition of the SARIMA model for Power.
# Automatic SARIMA with outdoor temperature as an exogenous regressor.
fit_cov <- auto.arima(power_train, xreg = temp_train)
# With xreg supplied, the horizon is taken from the length of xreg
# (672 points); `h` is redundant but harmless here.
prev_power_cov <- forecast(fit_cov, h = 672, xreg = temp_test)
autoplot(power_test) +
  autolayer(prev_power_cov$mean, series = "SARIMA with covariate") +
  autolayer(prev$mean, series = "Auto SARIMA without covariate") +
  autolayer(prevNN$mean, series = "NNAR")
fit_cov
## Series: power_train
## Regression with ARIMA(1,0,0)(0,1,0)[96] errors
##
## Coefficients:
## ar1 xreg
## 0.7744 0.3238
## s.e. 0.0104 0.2441
##
## sigma^2 estimated as 100.1: log likelihood=-13916.47
## AIC=27838.94 AICc=27838.94 BIC=27857.62
RMSE Auto SARIMA with covariate:
# Out-of-sample RMSE of the automatic SARIMA-with-covariate forecast.
rmse_sa_co <- sqrt(mean((prev_power_cov$mean - power_test)^2))
rmse_sa_co
## [1] 15.11528
# Residual diagnostics for the regression-with-ARIMA-errors model.
checkresiduals(fit_cov)
##
## Ljung-Box test
##
## data: Residuals from Regression with ARIMA(1,0,0)(0,1,0)[96] errors
## Q* = 1498.5, df = 190, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 192
There are still some auto-correlations on model residuals which suggest that residuals are not totally modeled.
We can try to find a better model manually.
We start by looking at the relationship between Power and Temperature.
# Time-series linear regression of Power on Temperature plus a linear trend
# and the 96 daily seasonal dummies.
power_train2 <- cbind(Production = power_train, Temp = temp_train)
model <- tslm(Production ~ Temp + trend + season, data = power_train2)
summary(model)
##
## Call:
## tslm(formula = Production ~ Temp + trend + season, data = power_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -112.381 -4.565 0.205 4.631 59.010
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.562e+02 2.100e+00 74.398 < 2e-16 ***
## Temp 1.158e+00 9.775e-02 11.849 < 2e-16 ***
## trend -4.875e-03 1.741e-04 -28.002 < 2e-16 ***
## season2 -6.668e-01 2.641e+00 -0.252 0.80070
## season3 -6.834e+00 2.641e+00 -2.587 0.00971 **
## season4 -3.391e-01 2.641e+00 -0.128 0.89785
## season5 2.848e+00 2.641e+00 1.078 0.28101
## season6 3.093e+00 2.625e+00 1.178 0.23869
## season7 -6.504e+00 2.625e+00 -2.478 0.01327 *
## season8 -6.572e+00 2.625e+00 -2.503 0.01234 *
## season9 -3.132e+00 2.625e+00 -1.193 0.23291
## season10 -4.103e+00 2.625e+00 -1.563 0.11814
## season11 -1.361e+00 2.625e+00 -0.518 0.60421
## season12 -2.156e+00 2.625e+00 -0.821 0.41155
## season13 -4.712e-01 2.625e+00 -0.179 0.85757
## season14 -5.117e+00 2.626e+00 -1.948 0.05144 .
## season15 -6.554e+00 2.626e+00 -2.496 0.01261 *
## season16 -8.845e-01 2.626e+00 -0.337 0.73627
## season17 5.479e-01 2.626e+00 0.209 0.83475
## season18 -3.644e-01 2.627e+00 -0.139 0.88969
## season19 -1.300e+00 2.627e+00 -0.495 0.62088
## season20 4.478e-01 2.627e+00 0.170 0.86466
## season21 -4.230e-02 2.627e+00 -0.016 0.98715
## season22 9.778e-01 2.628e+00 0.372 0.70988
## season23 2.015e+00 2.628e+00 0.767 0.44328
## season24 4.280e+00 2.628e+00 1.629 0.10350
## season25 3.832e+00 2.628e+00 1.458 0.14487
## season26 6.949e+00 2.628e+00 2.644 0.00823 **
## season27 1.378e+01 2.628e+00 5.244 1.66e-07 ***
## season28 1.581e+01 2.628e+00 6.016 1.96e-09 ***
## season29 1.598e+01 2.628e+00 6.081 1.31e-09 ***
## season30 2.186e+01 2.628e+00 8.318 < 2e-16 ***
## season31 2.165e+01 2.628e+00 8.236 2.43e-16 ***
## season32 1.624e+01 2.628e+00 6.179 7.16e-10 ***
## season33 1.991e+01 2.628e+00 7.577 4.44e-14 ***
## season34 1.040e+02 2.628e+00 39.580 < 2e-16 ***
## season35 1.017e+02 2.628e+00 38.721 < 2e-16 ***
## season36 9.907e+01 2.628e+00 37.700 < 2e-16 ***
## season37 9.873e+01 2.628e+00 37.572 < 2e-16 ***
## season38 1.014e+02 2.624e+00 38.633 < 2e-16 ***
## season39 9.647e+01 2.624e+00 36.758 < 2e-16 ***
## season40 9.893e+01 2.624e+00 37.695 < 2e-16 ***
## season41 9.960e+01 2.624e+00 37.951 < 2e-16 ***
## season42 9.584e+01 2.626e+00 36.504 < 2e-16 ***
## season43 9.657e+01 2.626e+00 36.781 < 2e-16 ***
## season44 9.821e+01 2.626e+00 37.407 < 2e-16 ***
## season45 9.858e+01 2.626e+00 37.546 < 2e-16 ***
## season46 1.005e+02 2.630e+00 38.219 < 2e-16 ***
## season47 9.857e+01 2.630e+00 37.478 < 2e-16 ***
## season48 9.924e+01 2.630e+00 37.734 < 2e-16 ***
## season49 1.000e+02 2.630e+00 38.037 < 2e-16 ***
## season50 9.941e+01 2.637e+00 37.695 < 2e-16 ***
## season51 1.028e+02 2.637e+00 38.962 < 2e-16 ***
## season52 1.019e+02 2.637e+00 38.647 < 2e-16 ***
## season53 1.005e+02 2.637e+00 38.098 < 2e-16 ***
## season54 1.020e+02 2.642e+00 38.597 < 2e-16 ***
## season55 1.023e+02 2.642e+00 38.724 < 2e-16 ***
## season56 1.015e+02 2.642e+00 38.419 < 2e-16 ***
## season57 1.009e+02 2.642e+00 38.200 < 2e-16 ***
## season58 1.010e+02 2.648e+00 38.149 < 2e-16 ***
## season59 1.013e+02 2.648e+00 38.274 < 2e-16 ***
## season60 1.010e+02 2.648e+00 38.132 < 2e-16 ***
## season61 1.008e+02 2.648e+00 38.062 < 2e-16 ***
## season62 1.005e+02 2.649e+00 37.955 < 2e-16 ***
## season63 1.002e+02 2.649e+00 37.814 < 2e-16 ***
## season64 1.002e+02 2.649e+00 37.821 < 2e-16 ***
## season65 1.001e+02 2.649e+00 37.790 < 2e-16 ***
## season66 1.005e+02 2.644e+00 38.031 < 2e-16 ***
## season67 1.010e+02 2.644e+00 38.201 < 2e-16 ***
## season68 9.845e+01 2.644e+00 37.239 < 2e-16 ***
## season69 9.845e+01 2.644e+00 37.237 < 2e-16 ***
## season70 1.198e+02 2.636e+00 45.456 < 2e-16 ***
## season71 1.337e+02 2.636e+00 50.734 < 2e-16 ***
## season72 1.450e+02 2.636e+00 54.998 < 2e-16 ***
## season73 1.443e+02 2.636e+00 54.755 < 2e-16 ***
## season74 1.420e+02 2.629e+00 54.012 < 2e-16 ***
## season75 1.407e+02 2.629e+00 53.500 < 2e-16 ***
## season76 1.398e+02 2.629e+00 53.166 < 2e-16 ***
## season77 1.402e+02 2.629e+00 53.316 < 2e-16 ***
## season78 1.459e+02 2.627e+00 55.547 < 2e-16 ***
## season79 1.419e+02 2.627e+00 54.022 < 2e-16 ***
## season80 1.406e+02 2.627e+00 53.511 < 2e-16 ***
## season81 1.391e+02 2.627e+00 52.926 < 2e-16 ***
## season82 1.391e+02 2.626e+00 52.981 < 2e-16 ***
## season83 1.368e+02 2.626e+00 52.100 < 2e-16 ***
## season84 1.367e+02 2.626e+00 52.052 < 2e-16 ***
## season85 1.357e+02 2.626e+00 51.657 < 2e-16 ***
## season86 1.337e+02 2.625e+00 50.938 < 2e-16 ***
## season87 1.326e+02 2.625e+00 50.525 < 2e-16 ***
## season88 1.315e+02 2.625e+00 50.087 < 2e-16 ***
## season89 1.293e+02 2.625e+00 49.243 < 2e-16 ***
## season90 1.123e+02 2.624e+00 42.776 < 2e-16 ***
## season91 1.109e+02 2.624e+00 42.267 < 2e-16 ***
## season92 1.048e+02 2.624e+00 39.949 < 2e-16 ***
## season93 1.060e+02 2.624e+00 40.374 < 2e-16 ***
## season94 3.007e+01 2.624e+00 11.457 < 2e-16 ***
## season95 3.221e+01 2.624e+00 12.274 < 2e-16 ***
## season96 3.037e+00 2.624e+00 1.157 0.24731
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.66 on 3737 degrees of freedom
## Multiple R-squared: 0.9597, Adjusted R-squared: 0.9586
## F-statistic: 917.3 on 97 and 3737 DF, p-value: < 2.2e-16
Temperature, the trend, and most of the seasonal dummies are significant.
We check the residuals of the model:
# ACF/PACF of the regression residuals, to choose an ARMA error structure.
ggtsdisplay(model$residuals)
There is an exponential decrease of the ACF and significant spike at lag
5 on the PACF. This looks like an AR5 model.
We can test the model:
# Fit an AR(5) to the regression residuals (exponentially decaying ACF with
# a PACF cut-off at lag 5 suggests AR(5)).
tmp <- model$residuals
fit3 <- Arima(tmp, order = c(5, 0, 0))
checkresiduals(fit3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,0) with non-zero mean
## Q* = 348.19, df = 186, p-value = 5.885e-12
##
## Model df: 6. Total lags used: 192
ggtsdisplay(fit3$residuals)
The residuals still do not look entirely like white noise. However,
looking at the significance level of the spikes on the acf/pacf plots
and the fact that other orders trials ended up in more important
auto-correlations, we continue with this model for the residuals.
Back to the entire model:
# Regression with ARIMA(5,0,0)(0,1,0)[96] errors, temperature as covariate.
fit_sacom <- Arima(power_train2[, "Production"],
                   xreg = power_train2[, "Temp"],
                   order = c(5, 0, 0), seasonal = c(0, 1, 0))
ggtsdisplay(fit_sacom$residuals)
There are still some significant auto-correlation: a spike at lag 96 on
the ACF and an exponential decrease on the PACF plot on seasonal lags.
We will model it with a seasonal MA1.
# Add a seasonal MA(1) term to absorb the remaining lag-96 ACF spike.
fit_sa_co_m <- Arima(power_train2[, "Production"],
                     xreg = power_train2[, "Temp"],
                     order = c(5, 0, 0), seasonal = c(0, 1, 1))
ggtsdisplay(fit_sa_co_m$residuals)
The auto-correlations are less significant with this model. We will use
this model for forecasting.
# Forecast the test window with the manual SARIMA + covariate model.
prev_sa_co_m <- forecast(fit_sa_co_m, h = 672, xreg = temp_test)
autoplot(power_test) +
  autolayer(prev_sa_co_m$mean, series = "Manual SARIMA with covariate")
RMSE Manual SARIMA with covariate:
# Out-of-sample RMSE of the manual SARIMA-with-covariate forecast.
rmse_sa_co_m <- sqrt(mean((prev_sa_co_m$mean - power_test)^2))
rmse_sa_co_m
## [1] 14.61462
# Model Power and Temperature jointly with a vector autoregression.
library(vars)
## Loading required package: MASS
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: urca
## Loading required package: lmtest
dataVar <- cbind(power, temperature)
# Select the VAR order by information criteria, allowing up to one full day
# (96 intervals) of lags.
VARselect(dataVar, lag.max = 96, type = "const")$selection
## AIC(n) HQ(n) SC(n) FPE(n)
## 96 96 96 96
The best VAR order found is 96 according to every selection criterion.
We use it for the construction of our VAR model:
# Fit the VAR(96) on the full series (reference) and on the training window
# (for out-of-sample evaluation), then forecast the 7 test days.
# Renamed the full-sample fit from `var` to `var_full`: `var` shadowed
# base::var() and the object is not used elsewhere.
var_full <- VAR(dataVar, p = 96, type = "const")
datatrain <- cbind(power_train, temp_train)
var2 <- VAR(datatrain, p = 96)
fcst2 <- forecast(var2, h = 672)
plot(fcst2)
RMSE for the VAR power:
# Out-of-sample RMSE of the VAR forecast for the Power series.
rmse_var_p <- sqrt(mean((power_test - fcst2$forecast$power_train$mean)^2))
rmse_var_p
## [1] 19.37745
# Overlay every model's forecast on the test series, then tabulate the
# out-of-sample RMSE of each model.
autoplot(power_test) +
  autolayer(prev_hw$mean, series = "HW without covariate") +
  autolayer(prev$mean, series = "Auto SARIMA without covariate") +
  autolayer(prev_sa_m$mean, series = "Manual SARIMA without covariate") +
  autolayer(prev_power_cov$mean, series = "Auto SARIMA with covariate") +
  autolayer(prevNN$mean, series = "NNAR") +
  autolayer(fcst2$forecast$power_train$mean, series = "VAR")
Models <- c("HW", "Auto SARIMA w/o covariate", "Manual SARIMA w/o covariate",
            "NNAR", "Auto SARIMA w/ covariate", "Manual SARIMA w/ covariate",
            "VAR")
RMSE <- c(rmse_hw, rmse_sa, rmse_sa_m, rmse_NN, rmse_sa_co, rmse_sa_co_m,
          rmse_var_p)
error.data <- data.frame(Models, RMSE)
print(error.data)
## Models RMSE
## 1 HW 20.19866
## 2 Auto SARIMA w/o covariate 15.37603
## 3 Manual SARIMA w/o covariate 14.74924
## 4 NNAR 63.31427
## 5 Auto SARIMA w/ covariate 15.11528
## 6 Manual SARIMA w/ covariate 14.61462
## 7 VAR 19.37745
In regard to the values obtained for RMSE, the best model using outdoor temperature for forecasting is the manual SARIMA with covariate (SARIMA(5,0,0)(0,1,1)[96]) and the one not using outdoor temperature is the manual SARIMA without covariate (SARIMA(0,1,4)(0,1,1)[96]).
# Refit the best no-covariate model (SARIMA(0,1,4)(0,1,1)[96]) on the full
# series and forecast the next day (96 steps).
final_fit <- Arima(power, order = c(0, 1, 4), seasonal = c(0, 1, 1))
final_prev <- forecast(final_fit, h = 96)
autoplot(final_prev) +
  # Typo fixed in the user-facing title ("Forcasting" -> "Forecasting").
  ggtitle("Forecasting of the power consumption of the 17th of Feb 2010") +
  labs(y = "Power consumption")
# Temperature for the forecast day (day 48).
# BUG FIX: the original ts(data$Temp, start = c(48, 1), end = c(48, 96))
# fills from the *start* of the vector, so it returned the first 96
# temperature readings mislabeled as day 48. Take the last 96 rows instead.
# TODO(review): confirm the file's final 96 rows are indeed 17 Feb 2010.
temp_final <- ts(tail(data$Temp, 96), start = c(48, 1), frequency = 96)
# Refit the best covariate model (SARIMA(5,0,0)(0,1,1)[96] + temperature)
# on the full series and forecast the next day.
final_fit2 <- Arima(power, order = c(5, 0, 0), seasonal = c(0, 1, 1),
                    xreg = temperature)
final_prev2 <- forecast(final_fit2, xreg = temp_final, h = 96)
autoplot(final_prev2) +
  # Typo fixed in the user-facing title ("Forcasting" -> "Forecasting").
  ggtitle("Forecasting of the power consumption of the 17th of Feb 2010 \n using outdoor Temperature") +
  labs(y = "Power consumption")
# Export both day-48 forecasts (column 1: without temperature, column 2:
# with temperature) to a single Excel sheet.
forecast_value <- cbind(final_prev$mean, final_prev2$mean)
write.xlsx(forecast_value, file = "CHERID.xlsx",
           sheetName = "Forecast", row.names = FALSE, col.names = FALSE,
           append = FALSE)